The code for this report is on github here.
> library(useful)
> library(readr)
> library(dplyr)
> library(tidyr)
> library(ggplot2)
> library(stringr)
Read in the data for each plate into one big dataframe, make unique IDs for each batch/well combination and use those IDs as the column names.
> plate_reader = function(fn) {
+ batch = strsplit(basename(fn), ".", fixed = TRUE)[[1]][1]
+ data = read.table(fn, header = TRUE, row.names = 1)
+ colnames(data) = paste(batch, colnames(data), sep = "_")
+ data
+ }
> plates_fns = sort(list.files("data", pattern = "*.dat", full.names = TRUE))
> plates = do.call(cbind, lapply(plates_fns, plate_reader))
Create a dataframe of the metadata about each sample. This has an identifier for a sample, Which well it came from, which batch and what it was treated with.
> welldata_fn = "metadata/Compound Layout 384w.csv"
> welldata = read_csv(welldata_fn) %>% gather(column, treatment, -row) %>% mutate(well = paste(row,
+ column, sep = "")) %>% dplyr::select(well, treatment)
> identities = data.frame(str_split_fixed(colnames(plates), "_", 3))
> colnames(identities) = c("batch", "drop", "well")
> identities$id = colnames(plates)
> welldata = identities %>% dplyr::select(batch, well) %>% left_join(welldata,
+ by = "well")
> rownames(welldata) = colnames(plates)
> welldata$sample = colnames(plates)
> welldata$classes = ifelse(welldata$treatment == "DMSO", "DMSO", ifelse(welldata$treatment ==
+ "AdMyoD", "AdMyoD", "other"))
Verify that the samples match up between the read counts and the metadata dataframe and make sure there are no NA counts.
> dim(welldata)
[1] 1920 5
> dim(plates)
[1] 23481 1920
> table(rownames(welldata) %in% colnames(plates))
TRUE
1920
> corner(plates)
M1_T384s1_A1 M1_T384s1_A2 M1_T384s1_A3 M1_T384s1_A4
0610005C13Rik 0 0 0 0
0610007N19Rik 2 5 2 12
0610007P14Rik 1 4 0 14
0610008F07Rik 0 0 0 0
0610009B14Rik 0 0 0 0
M1_T384s1_A5
0610005C13Rik 0
0610007N19Rik 9
0610007P14Rik 8
0610008F07Rik 0
0610009B14Rik 0
> corner(welldata)
batch well treatment sample classes
M1_T384s1_A1 M1 A1 DMSO M1_T384s1_A1 DMSO
M1_T384s1_A2 M1 A2 ABT-263 (Navitoclax) M1_T384s1_A2 other
M1_T384s1_A3 M1 A3 TG100-115 M1_T384s1_A3 other
M1_T384s1_A4 M1 A4 Lenalidomide (CC-5013) M1_T384s1_A4 other
M1_T384s1_A5 M1 A5 Oxcarbazepine M1_T384s1_A5 other
> table(complete.cases(plates))
TRUE
23481
Looks like we are good to go. While we’re at it we will load the positive control data.
> positive_fn = "data/Feo_positive_controls.unq.refseq.umi.SC_and_Myoblast_Raw.txt"
> positive = read.table(positive_fn, header = TRUE, row.names = rownames(plates))
> positive$id = NULL
> positive_samples = data.frame(str_split_fixed(colnames(positive), "_", 3))
> identities = data.frame(str_split_fixed(colnames(plates), "_", 3))$X1
> positive_welldata = data.frame(batch = positive_samples$X1, treatment = positive_samples$X1,
+ well = positive_samples$X1)
> rownames(positive_welldata) = colnames(positive)
Now we will calculate some summary statistics about each sample.
> welldata$genes_detected = colSums(plates > 0)
> welldata$genes_detected_zscore = ave(welldata$genes_detected, FUN = scale)
> welldata$genes_detected_pval = 2 * pnorm(-abs(welldata$genes_detected_zscore))
> welldata$genes_detected_padj = p.adjust(welldata$genes_detected_pval, method = "BH")
We see around 10k genes detected in each cell on each plate, with some cells with a low number of genes detected on each plate.
> ggplot(welldata, aes(batch, genes_detected)) + geom_boxplot() + ylab("genes with counts > 0") +
+ xlab("") + theme_bw()
> welldata$counts = colSums(plates)
> ggplot(welldata, aes(batch, counts)) + geom_boxplot() + ylab("total counts") +
+ xlab("") + theme_bw()
> ggplot(welldata, aes(counts, genes_detected, color = batch)) + geom_smooth(fill = NA) +
+ ylab("genes with counts > 0") + xlab("total counts") + theme_bw()
In this histogram of the genes detected, we can see there are a set of cells with a low amount of genes detected, more in the M2 plate than the other plates but for the most part the plates look pretty similar.
> ggplot(welldata, aes(genes_detected)) + geom_histogram() + theme_bw() + xlab("genes with counts > 0") +
+ facet_wrap(~batch)
Here we drop the cells with less than 7,500 genes detected.
> welldata = subset(welldata, genes_detected > 7500)
> plates = plates[, rownames(welldata)]
> library(biomaRt)
> mouse = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl",
+ host = "jul2015.archive.ensembl.org")
> conversions = getBM(attributes = c("ensembl_gene_id", "mgi_symbol", "gene_biotype"),
+ mart = mouse)
Samples do not have very many counts in noise genes, so that is not an issue. Noise genes are flagged by either being too small to be picked up reliably in a standard RNA-seq analysis or are highly variable and prone to introducing distortion such as rRNA.
> biotypes = unique(conversions$gene_biotype)
> noise_rna_biotypes = c("Mt_tRNA", "Mt_rRNA", "snoRNA", "snRNA", "misc_RNA",
+ "scaRNA", "rRNA", "sRNA")
> noise_rna_genes = subset(conversions, gene_biotype %in% noise_rna_biotypes)$mgi_symbol
> noise_rna = rownames(plates)[rownames(plates) %in% noise_rna_genes]
> welldata$noise_counts = colSums(plates[noise_rna, ])
> ggplot(welldata, aes(batch, noise_counts)) + geom_boxplot() + ylab("counts in noise genes") +
+ xlab("") + theme_bw()
We’ll drop the noise genes from consideration even though there aren’t many counts in them.
> plates = plates[!rownames(plates) %in% noise_rna_genes, ]
We’ll also drop all genes with that don’t have at least 100 counts total and are not seen in at least 4 samples. This cuts down the number of genes we are considering to ~13,500.
> plates = plates[rowSums(plates > 0) > 4 & rowSums(plates) > 100, ]
This is what we’re left with in terms of samples:
> knitr::kable(welldata %>% group_by(batch) %>% summarize(total = n()))
| batch | total |
|---|---|
| M1 | 302 |
| M2 | 295 |
| M3 | 369 |
| M4 | 359 |
| M5 | 358 |
and we’re left with 13488 genes to consider.
> library(Seurat)
> seurat.data = new("seurat", raw.data = plates)
> seurat.data = Setup(seurat.data, project = "rubin", min.cells = 3, min.genes = 1000,
+ meta.data = welldata, total.expr = 10000)
[1] "Performing log-normalization"
[1] "Scaling data matrix"
|
| | 0%
|
|===== | 7%
|
|========= | 14%
|
|============== | 21%
|
|=================== | 29%
|
|======================= | 36%
|
|============================ | 43%
|
|================================ | 50%
|
|===================================== | 57%
|
|========================================== | 64%
|
|============================================== | 71%
|
|=================================================== | 79%
|
|======================================================== | 86%
|
|============================================================ | 93%
|
|=================================================================| 100%
Here we look at what are the most variable genes across the samples. We can see a lot of these are subunits of ribosomal proteins, these will be used for the PCA.
> seurat.data = MeanVarPlot(seurat.data, y.cutoff = 0.5, x.low.cutoff = 0.0125,
+ x.high.cutoff = 3, fxn.x = expMean, fxn.y = logVarDivMean)
[1] "Calculating gene dispersion"
|
| | 0%
|
|===== | 7%
|
|========= | 14%
|
|============== | 21%
|
|=================== | 29%
|
|======================= | 36%
|
|============================ | 43%
|
|================================ | 50%
|
|===================================== | 57%
|
|========================================== | 64%
|
|============================================== | 71%
|
|=================================================== | 79%
|
|======================================================== | 86%
|
|============================================================ | 93%
|
|=================================================================| 100%
We’ll focus on component 1 since that seems to be what separates out the batch 3 plate from the other plates.
> seurat.data = PCA(seurat.data, do.print = FALSE)
> PCAPlot(seurat.data, 1, 2, pt.size = 2)
> rot = seurat.data@pca.rot %>% tibble::rownames_to_column(var = "sample") %>%
+ left_join(welldata, by = "sample")
> ggplot(rot, aes(PC1, PC2, shape = classes, color = batch)) + geom_point(size = 2) +
+ theme_bw()
There are quite a few ribosomal proteins that are flagged. These should for the most part eithr be stably expressed or are not particularly interesting hits.
Since TCL3 is so different, and it seems different in non-interesting ways, let’s just drop it for now to make our lives easier. We can see that after dropping TCL3 we see a separation of the AdMyoD samples from the other samples.
> welldata = subset(welldata, batch != "M3")
> plates = plates[, rownames(welldata)]
> seurat.data = new("seurat", raw.data = plates)
> seurat.data = Setup(seurat.data, project = "rubin", min.cells = 3, min.genes = 200,
+ total.expr = 10000, meta.data = welldata)
[1] "Performing log-normalization"
[1] "Scaling data matrix"
|
| | 0%
|
|===== | 7%
|
|========= | 14%
|
|============== | 21%
|
|=================== | 29%
|
|======================= | 36%
|
|============================ | 43%
|
|================================ | 50%
|
|===================================== | 57%
|
|========================================== | 64%
|
|============================================== | 71%
|
|=================================================== | 79%
|
|======================================================== | 86%
|
|============================================================ | 93%
|
|=================================================================| 100%
> seurat.data = MeanVarPlot(seurat.data, y.cutoff = 0.5, x.low.cutoff = 0.0125,
+ x.high.cutoff = 3, do.contour = F, fxn.x = expMean, fxn.y = logVarDivMean)
[1] "Calculating gene dispersion"
|
| | 0%
|
|===== | 7%
|
|========= | 14%
|
|============== | 21%
|
|=================== | 29%
|
|======================= | 36%
|
|============================ | 43%
|
|================================ | 50%
|
|===================================== | 57%
|
|========================================== | 64%
|
|============================================== | 71%
|
|=================================================== | 79%
|
|======================================================== | 86%
|
|============================================================ | 93%
|
|=================================================================| 100%
> seurat.data = PCA(seurat.data, do.print = FALSE)
> PCAPlot(seurat.data, 1, 2, pt.size = 2)
> seurat.data = ProjectPCA(seurat.data, do.print = FALSE)
|
| | 0%
|
|===== | 7%
|
|========= | 14%
|
|============== | 21%
|
|=================== | 29%
|
|======================= | 36%
|
|============================ | 43%
|
|================================ | 50%
|
|===================================== | 57%
|
|========================================== | 64%
|
|============================================== | 71%
|
|=================================================== | 79%
|
|======================================================== | 86%
|
|============================================================ | 93%
|
|=================================================================| 100%
> VizPCA(seurat.data, pcs.use = 1:3, num.genes = 10, use.full = TRUE, nCol = 3)
> rot = seurat.data@pca.rot %>% tibble::rownames_to_column(var = "sample") %>%
+ left_join(welldata, by = "sample")
> ggplot(rot, aes(PC1, PC2, color = batch, label = treatment)) + geom_text(size = 2) +
+ theme_bw()
We can see a batch effect in PC3 and PC4, which shows that we should make sure that we are controlling for batch effects when we do thes experiments.
> ggplot(rot, aes(PC3, PC4, color = batch, label = treatment)) + geom_text(size = 2) +
+ theme_bw()
Going back to PC1 and PC2, we see there are some chemicals that seem to move the cells towards the AdMyoD treated cells. Zooming in on that area shows some more candidates:
> ggplot(rot, aes(PC1, PC2, color = batch, label = treatment)) + coord_cartesian(xlim = c(-30,
+ 0), ylim = c(-25, 0)) + geom_text(size = 2) + theme_bw()
Here we take a different tactic and find the genes that are most different between the AdMyoD samples and the DMSO treated samples. Then we will use those genes to measure how far each of the treated samples is from the AdMyoD samples in terms of expression of those genes.
> md_samples = rownames(subset(welldata, treatment %in% c("AdMyoD", "DMSO")))
> md_metadata = welldata[md_samples, ]
> md_counts = plates[, md_samples]
We fit a model that tests for differences between AdMyoD and DMSO treatment while controlling for batch effects.
> library(DESeq2)
> dds = DESeqDataSetFromMatrix(countData = md_counts, colData = md_metadata, design = ~batch +
+ treatment)
> dds = DESeq(dds)
> plotDispEsts(dds)
> res = results(dds)
> sig = subset(res, padj < 0.05) %>% data.frame() %>% tibble::rownames_to_column(var = "gene") %>%
+ dplyr::arrange(padj)
There are 1524 genes tagged as differentially expressed between the AdMyoD and the DMSO treated samples. We’ll use those genes to measure how far the chemical treated samples are from the AdMyoD treated samples. This gives different results than the PCA method we were using:
> comp1class = welldata[, c("sample", "treatment", "classes")]
> colnames(comp1class) = c("sample", "comp1treat", "comp1class")
> comp2class = welldata[, c("sample", "treatment", "classes")]
> colnames(comp2class) = c("sample", "comp2treat", "comp2class")
> dds = DESeqDataSetFromMatrix(countData = plates, colData = welldata, design = ~batch +
+ treatment)
> dds = estimateSizeFactors(dds)
> ncounts = log(counts(dds, normalized = TRUE) + 1)
> dists = as.matrix(dist(t(ncounts[sig$gene, ])))
> dists[diag(dists)] = NA
> dists = dists %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>%
+ tidyr::gather(sample, distance, -rowname)
> colnames(dists) = c("comp1", "comp2", "distance")
> dists = dists %>% left_join(comp1class, by = c(comp1 = "sample")) %>% left_join(comp2class,
+ by = c(comp2 = "sample")) %>% group_by(comp1treat, comp2treat) %>% summarise(mtreatdist = mean(distance),
+ mtreatsd = sd(distance))
> myod = subset(dists, comp1treat == "AdMyoD") %>% dplyr::arrange(mtreatdist) %>%
+ mutate(rank = dense_rank(mtreatdist))
> ggplot(myod, aes(rank, mtreatdist, label = comp2treat)) + geom_point() + theme(axis.title.x = element_blank(),
+ axis.text.x = element_blank(), axis.ticks.x = element_blank()) + ylab("Euclidean distance to AdMyoD")
> write_csv(myod, "normalized-count-distance-to-admyod.csv")
Another way is to just look at the We can also calculate the distances between points on PC1 and PC2 to get an estimate of how similar the samples are to each other.
> pcamat = as.matrix(rot[, c("PC1", "PC2")])
> rownames(pcamat) = rot$sample
> pcadist = as.matrix(dist(pcamat))
> pcadist[diag(pcadist)] = NA
> pcadist = pcadist %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>%
+ tidyr::gather(sample, distance, -rowname)
> colnames(pcadist) = c("comp1", "comp2", "distance")
> pcadist = pcadist %>% left_join(comp1class, by = c(comp1 = "sample")) %>% left_join(comp2class,
+ by = c(comp2 = "sample")) %>% group_by(comp1treat, comp2treat) %>% summarise(mtreatdist = mean(distance),
+ mtreatsd = sd(distance))
> myod = subset(pcadist, comp1treat == "AdMyoD") %>% dplyr::arrange(mtreatdist) %>%
+ mutate(rank = dense_rank(mtreatdist))
> ggplot(myod, aes(rank, mtreatdist, label = comp2treat)) + geom_point() + theme(axis.title.x = element_blank(),
+ axis.text.x = element_blank(), axis.ticks.x = element_blank()) + ylab("Euclidean distance to AdMyoD")
> write_csv(myod, "pca-distance-to-admyod.csv")
> comp1class = welldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comp1class) = c("sample", "comp1treat", "comp1class", "batch")
> comp2class = welldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comp2class) = c("sample", "comp2treat", "comp2class", "batch")
> pcamat = as.matrix(rot[, c("PC1", "PC2")])
> rownames(pcamat) = rot$sample
> pcadist = as.matrix(dist(pcamat))
> pcadist[diag(pcadist)] = NA
> pcadist = pcadist %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>%
+ tidyr::gather(sample, distance, -rowname)
> colnames(pcadist) = c("comp1", "comp2", "distance")
> pcadist = pcadist %>% left_join(comb1class, by = c(comp1 = "sample")) %>% left_join(comb2class,
+ by = c(comp2 = "sample")) %>% group_by(comb1treat, comb2treat, batch.y) %>%
+ summarise(mtreatdist = mean(distance), mtreatsd = sd(distance))
> scall = subset(pcadist, comb1treat == "AdMyoD") %>% dplyr::arrange(mtreatdist)
> write_csv(scall, "pca-distance-to-admyod-by-batch.csv")
Here we can see along PC1 a separation of the SC and Myo samples, the AdMyoD separation is along component 2. We have no way of determining though if this separation is due to batch differences or due to actual biological signal of SC and Myo.
> positive = positive[rownames(plates), ]
> combwelldata = rbind(positive_welldata, welldata[, colnames(positive_welldata)])
> combwelldata$classes = ifelse(combwelldata$treatment == "DMSO", "DMSO", ifelse(welldata$treatment ==
+ "SC", "SC", "other"))
> combwelldata$sample = rownames(combwelldata)
> combwell = cbind(positive, plates)
> library(Seurat)
> combined.data = new("seurat", raw.data = combwell)
> combined.data = Setup(combined.data, project = "rubin", min.cells = 3, min.genes = 1000,
+ meta.data = combwelldata, total.expr = 10000)
[1] "Performing log-normalization"
[1] "Scaling data matrix"
|
| | 0%
|
|===== | 7%
|
|========= | 14%
|
|============== | 21%
|
|=================== | 29%
|
|======================= | 36%
|
|============================ | 43%
|
|================================ | 50%
|
|===================================== | 57%
|
|========================================== | 64%
|
|============================================== | 71%
|
|=================================================== | 79%
|
|======================================================== | 86%
|
|============================================================ | 93%
|
|=================================================================| 100%
> combined.data = MeanVarPlot(combined.data, y.cutoff = 0.5, x.low.cutoff = 0.0125,
+ x.high.cutoff = 3, do.contour = F, fxn.x = expMean, fxn.y = logVarDivMean)
[1] "Calculating gene dispersion"
|
| | 0%
|
|===== | 7%
|
|========= | 14%
|
|============== | 21%
|
|=================== | 29%
|
|======================= | 36%
|
|============================ | 43%
|
|================================ | 50%
|
|===================================== | 57%
|
|========================================== | 64%
|
|============================================== | 71%
|
|=================================================== | 79%
|
|======================================================== | 86%
|
|============================================================ | 93%
|
|=================================================================| 100%
> combined.data = PCA(combined.data, do.print = FALSE)
> PCAPlot(combined.data, 1, 2, pt.size = 2)
> combined.data = ProjectPCA(combined.data, do.print = FALSE)
|
| | 0%
|
|===== | 7%
|
|========= | 14%
|
|============== | 21%
|
|=================== | 29%
|
|======================= | 36%
|
|============================ | 43%
|
|================================ | 50%
|
|===================================== | 57%
|
|========================================== | 64%
|
|============================================== | 71%
|
|=================================================== | 79%
|
|======================================================== | 86%
|
|============================================================ | 93%
|
|=================================================================| 100%
> rot = combined.data@pca.rot %>% tibble::rownames_to_column(var = "sample") %>%
+ left_join(combwelldata, by = "sample")
> ggplot(rot, aes(PC1, PC2, color = batch, label = treatment)) + geom_text(size = 2) +
+ theme_bw()
If it is not possible to include SC and Myo samples on the single-cell plate, we could also correct for batch if we had a set of DMSO treated samples on the SC and Myo plates, but we need some overlapping samples so we can correct for the batch effect.
> comb1class = combwelldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comb1class) = c("sample", "comb1treat", "comb1class", "batch")
> comb2class = combwelldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comb2class) = c("sample", "comb2treat", "comb2class", "batch")
> pcamat = as.matrix(rot[, c("PC1", "PC2")])
> rownames(pcamat) = rot$sample
> pcadist = as.matrix(dist(pcamat))
> pcadist[diag(pcadist)] = NA
> pcadist = pcadist %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>%
+ tidyr::gather(sample, distance, -rowname)
> colnames(pcadist) = c("comp1", "comp2", "distance")
> pcadist = pcadist %>% left_join(comb1class, by = c(comp1 = "sample")) %>% left_join(comb2class,
+ by = c(comp2 = "sample")) %>% group_by(comb1treat, comb2treat) %>% summarise(mtreatdist = mean(distance),
+ mtreatsd = sd(distance))
> sc = subset(pcadist, comb1treat == "SC") %>% dplyr::arrange(mtreatdist) %>%
+ mutate(rank = dense_rank(mtreatdist))
> ggplot(sc, aes(rank, mtreatdist, label = comb2treat)) + geom_point() + theme(axis.title.x = element_blank(),
+ axis.text.x = element_blank(), axis.ticks.x = element_blank()) + ylab("Euclidean distance to SC")
> write_csv(sc, "pca-distance-to-SC.csv")
> comb1class = combwelldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comb1class) = c("sample", "comb1treat", "comb1class", "batch")
> comb2class = combwelldata[, c("sample", "treatment", "classes", "batch")]
> colnames(comb2class) = c("sample", "comb2treat", "comb2class", "batch")
> pcamat = as.matrix(rot[, c("PC1", "PC2")])
> rownames(pcamat) = rot$sample
> pcadist = as.matrix(dist(pcamat))
> pcadist[diag(pcadist)] = NA
> pcadist = pcadist %>% data.frame() %>% na.omit() %>% tibble::rownames_to_column() %>%
+ tidyr::gather(sample, distance, -rowname)
> colnames(pcadist) = c("comp1", "comp2", "distance")
> pcadist = pcadist %>% left_join(comb1class, by = c(comp1 = "sample")) %>% left_join(comb2class,
+ by = c(comp2 = "sample")) %>% group_by(comb1treat, comb2treat, batch.y) %>%
+ summarise(mtreatdist = mean(distance), mtreatsd = sd(distance))
> scall = subset(pcadist, comb1treat == "SC") %>% dplyr::arrange(mtreatdist)
> write_csv(scall, "pca-distance-to-SC-by-batch.csv")